We currently assign sensitivities by tail-assignment; however, there may be cases with distinctive multimodal distributions that are indicative of different response models.
We'll begin by testing the method on beatAML data, where we have a much greater number of observations and therefore better-defined distributions.
$$ p(x) = \frac{1}{\sqrt{ 2 \pi \sigma^2 }} e^{ - \frac{ (x - \mu)^2 }{ 2 \sigma^2 } } $$

%%javascript
// Disable Jupyter's output-area auto-scroll so long cell outputs stay fully expanded.
IPython.OutputArea.prototype._should_scroll = function(lines) {
return false;
}
# Core data handling and plotting libraries
import pandas as pd
import numpy as np
# NOTE(review): sklearn.mixture.GMM was deprecated in scikit-learn 0.18 and
# removed in 0.20 — this notebook requires an older sklearn. The modern
# replacement is GaussianMixture (with `covars_` renamed to `covariances_`).
from sklearn.mixture import GMM
import seaborn as sbn
from matplotlib import pyplot as plt
import warnings
# ------ for mut models ---------
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
import statsmodels.api as sm
from sklearn import metrics
# Silence deprecation chatter for cleaner notebook output
warnings.filterwarnings("ignore")
# beatAML drug-response table: one row per (inhibitor, sample) AUC measurement
aml_aucs = pd.read_csv('./../data/beatAML_aucs.csv')[['inhibitor','lab_id','auc']].drop_duplicates()
#aml_aucs = pd.read_csv('./../data/beatAML_AUCs_subset.csv')
aml_aucs.head()
def add_normal_plot(mu, s, weight, c, ax, auc_max=300):
    '''
    Overlay a weight-scaled Gaussian density curve on an existing axes.

    mu      = mean of the Gaussian
    s       = standard deviation
    weight  = mixture weight used to scale the curve's height
    c       = matplotlib color string, e.g. 'r', 'b', 'g'
    ax      = matplotlib axes to draw on
    auc_max = upper bound of the x-range; curve is drawn on [0, auc_max)
    '''
    xs = np.arange(0, auc_max, 1)
    # normal pdf evaluated on the grid, scaled by the component's mixture weight
    density = (1 / (2 * np.pi * s ** 2) ** (0.5)) * np.exp(-(xs - mu) ** 2 / (2 * s ** 2))
    ax.plot(xs, density * weight, color=c, label='mean: %.1f, std: %.1f' % (mu, s))
def get_color():
    '''
    Yield a fixed palette of matplotlib color codes, one per mixture component.
    '''
    yield from ('r', 'b', 'g', 'c', 'y')

gen = get_color()
next(gen)
def test_multimodal_fits(X, ntests=10, kmax=5, plot=True, inhib=None, override_k=False):
    '''
    Fit Gaussian mixture models with k = 1..kmax-1 components to X and select
    the best k by BIC (unless override_k forces a specific k).

    X          : (n_samples, 1) array of AUC values
    ntests     : repeated fits per k (fits are initialization-dependent)
    kmax       : exclusive upper bound on the number of components tried
    plot       : when True, draw the AUC histogram, BIC-vs-k scatter, and the
                 per-component fit, and print a summary
    inhib      : inhibitor name used as the figure suptitle
    override_k : falsy to select k by BIC; otherwise an int forcing that k

    Returns the per-sample component assignments (predict) of the best model.

    NOTE(review): uses the legacy sklearn GMM API (removed in sklearn 0.20);
    indentation reconstructed from a flattened notebook export — the prints
    and plt.show() are assumed to sit inside `if plot:` (confirm intent).
    '''
    res = {x:[] for x in ['k', 'aic', 'bic']}
    for k in range(1,kmax):
        for i in range(ntests):
            #print('k: %d' %k)
            gmm = GMM(n_components=k, n_init=1)
            gmm.fit(X)
            res['k'].append( k )
            res['aic'].append( gmm.aic(X) )
            res['bic'].append( gmm.bic(X) )
    res = pd.DataFrame( res )
    # lowest-BIC k wins; a truthy override_k bypasses model selection entirely
    best_k = res[res.bic == np.min(res.bic)].k.unique()[0] if not override_k else override_k
    # refit the chosen k with many restarts for a more stable final model
    gmm_best = GMM(n_components=best_k,n_init=20)
    gmm_best.fit(X)
    P = gmm_best.predict(X)
    if plot:
        nbins = 50
        bin_ = np.arange(0,np.max(X),np.max(X)/nbins)
        scalar_to_make_pretty = 0.25 # since our fitted Gaussians are normalized to their weights, they appear smaller
        weights_ = scalar_to_make_pretty/len(X)
        f, axs = plt.subplots(1,3,figsize=(15,5))
        sbn.distplot(X, bins=bin_, ax=axs[0]).set_title('AUC distribution')
        sbn.scatterplot(x='k', y='bic', alpha=0.3, data=res, ax=axs[1]).set_title('BIC vs K')
        clas = 0
        # one histogram + fitted Gaussian per component; get_color supplies at
        # most 5 colors, so components beyond 5 would be silently dropped here
        for weight, mean, covars, c in zip(gmm_best.weights_, gmm_best.means_, gmm_best.covars_, get_color()):
            sbn.distplot(X[P==clas], bins=bin_, kde=False, color=c, ax=axs[2], label='AUC', hist_kws={'weights': [weights_]*len(X[P==clas])})
            add_normal_plot(mean[0], (covars[0])**0.5, weight, c, axs[2], auc_max=np.max(X))
            clas+=1
        axs[2].set_title('Optimal GMM fit')
        plt.legend()
        plt.suptitle(inhib)
        print('Number of assays (aucs): %d' %len(X))
        print('Optimal K: %d [BIC=%.1f]' %(best_k, np.min(res.bic)))
        print('GMM fit:\n\tMixture Weights: %r\n\tMeans: %r\n\tVariances: %r' %(gmm_best.weights_.ravel(), gmm_best.means_.ravel(), gmm_best.covars_.ravel()))
        print('Class counts: %r' %['class %d: %d' %(cl, len(X[P==cl])) for cl in list(set(P))])
        plt.show()
    return P
# Run the mixture-model selection on every inhibitor with a deep assay count.
# NOTE: `inhib` intentionally keeps its name — it leaks into later cells.
for inhib in aml_aucs.inhibitor.unique():
    per_drug = aml_aucs[aml_aucs.inhibitor == inhib]
    # only drugs with many observations give well-defined distributions
    if per_drug.shape[0] > 400:
        print('------------------------------------------------')
        print('Inhibitor: %s' % inhib)
        print('------------------------------------------------')
        auc_matrix = per_drug.auc.values.reshape(-1, 1)
        test_multimodal_fits(auc_matrix, ntests=10, kmax=6, inhib=inhib, plot=True)

# Inhibitors whose beatAML AUC distributions looked usable; persist the subset.
okay_data = ['YM-155', 'Vandetanib (ZD6474)', 'Trametinib (GSK1120212)', 'Sunitinib','Sorafenib', 'Selumetinib (AZD6244)', 'Nilotinib', 'JAK Inhibitor I', 'Pazopanib (GW786034)', 'Elesclomol', 'Dasatinib']
dat = aml_aucs[aml_aucs.inhibitor.isin(okay_data)]
dat.to_csv('./../data/beatAML_AUCs_subset.csv')
The next step is to ask: can we model these drug responses across cancer types? That is, do the distributions that are well defined by the beatAML data similarly describe the HNSCC distributions?
We will try to test this by setting our null hypothesis as:
Let X be the underlying set distribution such that: $$ AUC_{beatAML} \in X $$
$$ H_0: AUC_{HNSCC} \in X $$ and $$ H_1: AUC_{HNSCC} \notin X $$
We will use permutation testing, and reject the null hypothesis in cases where p < alpha = 0.05.
# --- HNSCC functional data: compare one drug's AUC distribution to beatAML ---
HNSCC_all = pd.read_csv('./../data/HNSCC_all_functional_data.csv')
HNSCC_auc = HNSCC_all[['lab_id','inhibitor','auc','call']].drop_duplicates()
HNSCC_auc.head()

# BUG FIX(review): these variables and printed labels previously said
# "Dasatinib" while the data was filtered on Sorafenib — renamed so the
# output matches the data actually used.
AML_Sorafenib = aml_aucs[aml_aucs['inhibitor'] == 'Sorafenib']
print('Number of AML Sorafenib assays: %d' % len(AML_Sorafenib))
AML_Sorafenib.head()

HNSCC_Sorafenib = HNSCC_auc[HNSCC_auc['inhibitor'] == 'Sorafenib'].drop_duplicates()
print('Number of HNSCC Sorafenib assays: %d' % len(HNSCC_Sorafenib))
HNSCC_Sorafenib.head()

print('------------------------------------------------')
print('Inhibitor: %s' % 'AML_Sorafenib')
print('------------------------------------------------')
aml_ = AML_Sorafenib.auc.values.reshape(-1, 1)
# BUG FIX(review): previously passed the stale loop variable `inhib` as the
# title; pass an explicit, correct label instead.
c = test_multimodal_fits(aml_, ntests=10, kmax=6, inhib='AML Sorafenib', plot=True)

print('------------------------------------------------')
print('Inhibitor: %s' % 'HNSCC_Sorafenib')
print('------------------------------------------------')
# NOTE(review): the 100x rescale assumes HNSCC AUCs are on a 1/100 scale
# relative to beatAML — confirm the HNSCC AUC units against the source data.
hnscc_ = 100 * HNSCC_Sorafenib.auc.values.reshape(-1, 1)
c = test_multimodal_fits(hnscc_, ntests=10, kmax=6, inhib='HNSCC Sorafenib', plot=True)

# Overlay the two normalized AUC histograms on a common 0-300 binning.
nbins = 50
bin_ = np.arange(0, 300, 300 / nbins)
plt.figure()
# BUG FIX(review): `normed=` was removed in matplotlib 3.x; `density=` is the
# supported equivalent (same normalization).
plt.hist(aml_, color='blue', label='AML_AUC', density=True, alpha=0.4, bins=bin_)
plt.hist(hnscc_, color='red', label='HNSCC_AUC', density=True, alpha=0.4, bins=bin_)
plt.legend()
plt.show()
def permutation_test(x, y, n=1e5, verbose=True, return_prob=True, alpha=0.05):
    '''
    Two-tailed permutation test of whether y is drawn from the same underlying
    distribution as x, using |mean(x) - mean(y)| as the test statistic.

    x, y        : array-likes of observations (pooled and flattened internally)
    n           : number of random permutations to draw
    verbose     : print a progress line every 333 permutations
    return_prob : when True return the p-value; when False return the boolean
                  pval >= alpha (True = fail to reject the null that x and y
                  share a distribution)
    alpha       : significance threshold, used only when return_prob is False

    Returns the permutation p-value — the fraction of permutations whose
    absolute mean difference is at least the observed one — or a bool as above.
    '''
    delta_mean = np.abs(np.mean(x) - np.mean(y))
    pooled = np.append(x, y)  # np.append flattens both inputs into one 1-D pool
    permutation_means = []
    for i in range(int(n)):
        if (i % 333 == 0) and verbose:
            print('Running permutations...[%.2f%%]' % (i / n * 100), end='\r')
        perms = np.random.permutation(pooled)
        perm_y = perms[:len(y)]
        perm_x = perms[len(y):]
        permutation_means.append(np.abs(np.mean(perm_y) - np.mean(perm_x)))
    # BUG FIX(review): the original compared a Python list to a float
    # (`list >= float`), which raises TypeError on Python 3 — convert to an
    # ndarray so the comparison broadcasts elementwise.
    permutation_means = np.asarray(permutation_means)
    pval = np.sum(permutation_means >= delta_mean) / len(permutation_means)
    return pval if return_prob else pval >= alpha
# Permutation-test the AML vs HNSCC Sorafenib AUC samples (aml_/hnscc_ were
# filtered on inhibitor == 'Sorafenib' above).
pval = permutation_test(aml_, hnscc_, n=1e6, verbose=True, return_prob=True)
# BUG FIX(review): label previously said "Dasatinib" but the data is Sorafenib.
print('Sorafenib permutation test p-value: %f' %pval)
# --- beatAML variant calls: build the recurrent-gene mutation set ---
aml_var_all = pd.read_csv('./../data/vizome_beatAML_variants.csv', skiprows=[0,1,2])
# keep fully-annotated calls and only the columns needed downstream
aml_var = aml_var_all.dropna()[['sample_id', 'gene', 'chr','start','end','ref','alt','variant','sift']]
aml_var = aml_var.rename(columns={'sample_id':'lab_id'})
print(aml_var.head())
print(aml_var[['chr','start','end','ref','alt']].head())
print('Number of genes mutated: %d' %len(aml_var.gene.unique()))
print('Number of unique mutations: %d' %aml_var[['chr','start','end','ref','alt']].drop_duplicates().shape[0])
# per-gene row counts; FIX(review): this index holds GENES with > 20 variant
# rows, not patients — renamed from the misleading `mult_patients`.
mut_cnt = aml_var.groupby('gene').count()['lab_id'].sort_values()
recurrent_genes = mut_cnt.index[mut_cnt > 20]
shared_mutations = list(recurrent_genes)
print(shared_mutations)
print(aml_var.groupby('gene').count()['lab_id'].sort_values())
# restrict the variant table to recurrently mutated genes only
aml_var = aml_var[aml_var['gene'].isin(shared_mutations)]
aml_var.head()
# convert variants to matrix format
# Build a binary sample-by-gene membership matrix: one row per lab_id, one
# column per gene, '1' if that sample carries any variant in that gene.
# NOTE(review): the flags are stored as the *strings* '0'/'1', not ints —
# later code casts with astype(int) before statsmodels; confirm the sklearn
# steps coerce these strings to numerics as intended.
genes = pd.Series( aml_var.gene.unique() )
res = []
for lab_id in aml_var.lab_id.unique():
    D = aml_var[aml_var['lab_id'] == lab_id]
    x = genes.isin( D['gene'] ).tolist()
    res.append( [lab_id] + [str(int(xx)) for xx in x] )
res = pd.DataFrame( res, columns=['lab_id'] + genes.tolist() )
res.head()
MAT_VAR = res
print('matrix variant shape: %s' %str(res.shape))
print(MAT_VAR.head())
print( aml_aucs.head() )
# normalize the join key to str on both sides so the merge dtypes agree
aml_aucs['lab_id']=aml_aucs['lab_id'].astype(str)
MAT_VAR['lab_id']=MAT_VAR['lab_id'].astype(str)
# inner join: keep only samples with both an AUC and a variant profile
aml_auc_w_var = aml_aucs.merge(right=MAT_VAR, how='inner', on=['lab_id'])
aml_auc_w_var.head(5)
# --- Predict Dasatinib GMM class labels from the mutation matrix ---
Dasatinib = aml_auc_w_var[aml_auc_w_var['inhibitor'] == 'Dasatinib']
print('Dasatinib auc and variants shape: %s' %str(Dasatinib.shape))
aucs = Dasatinib.auc.values.reshape(-1,1)
X = Dasatinib[genes]
# force a 2-component fit; Y holds the per-sample component labels
Y = test_multimodal_fits(aucs, inhib='Dasatinib', override_k=2)

# t-SNE embedding of the mutation profiles, colored by GMM class
X_embedded = TSNE(n_components=2).fit_transform(X)
df = pd.DataFrame( {'x1':X_embedded[:,0], 'x2':X_embedded[:,1], 'label':Y} )
plt.figure()
sbn.scatterplot(x='x1', y='x2', hue='label', data=df)
plt.show()

# NOTE(review): ElasticNet is a *regressor*, used here as a crude classifier
# by thresholding its continuous output at 0.5.
reg = ElasticNet().fit(X, Y)
yhat = reg.predict(X)
pred = (yhat > 0.5)*1
# FIX(review): replaced the no-op `sum([...]*1)` list idiom with np.mean
acc = np.mean(pred == Y)
print('Elastic Net accuracy: %.2f' %acc)
fpr, tpr, thresholds = metrics.roc_curve(Y, yhat, pos_label=1)
EN_auc = metrics.auc(fpr, tpr)
print('Elastic Net AUC: %.2f' %EN_auc)
plt.figure()
plt.plot(fpr,tpr,'r--')
plt.show()
# --- Logistic regression on the same mutation features ---
logit = LogisticRegression().fit(X, Y)
pred = logit.predict(X)  # predict() already returns hard 0/1 class labels
print(pred)
# FIX(review): replaced the no-op `sum([...]*1)` list idiom with np.mean
acc = np.mean(pred == Y)
print('Logistic Regression accuracy: %.2f' %acc)
# BUG FIX(review): the ROC curve was previously computed from hard class
# labels, which collapses it to a single point and misstates the AUC — score
# with the positive-class probability instead.
yhat = logit.predict_proba(X)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(Y, yhat, pos_label=1)
lr_auc = metrics.auc(fpr, tpr)
print('Logistic Regression AUC: %.2f' %lr_auc)
plt.figure()
plt.plot(fpr,tpr,'r--')
plt.show()
print('---------------------------------------------------')
print('STATSMODEL LOGIT')
print('---------------------------------------------------')
# statsmodels needs numeric input: cast the '0'/'1' string flags to ints
X2 = X.values.astype(int)
# add_constant supplies the intercept column statsmodels does not add itself
logit_sm = sm.Logit(Y, sm.add_constant(X2)).fit(method='lbfgs', maxiter=500)
yhat = logit_sm.predict(sm.add_constant(X2))  # Logit.predict returns probabilities
# NOTE(review): 0.9 is an ad-hoc decision threshold — confirm against the
# class balance before trusting the accuracy figure below.
cutoff = 0.9
pred = (yhat > cutoff)*1
# FIX(review): replaced the no-op `sum([...]*1)` list idiom with np.mean
acc = np.mean(pred == Y)
print('Logistic Regression accuracy: %.2f' %acc)
# ROC here correctly uses the predicted probabilities
fpr, tpr, thresholds = metrics.roc_curve(Y, yhat, pos_label=1)
sm_auc = metrics.auc(fpr, tpr)
print('Logistic Regression AUC: %.2f' %sm_auc)
plt.figure()
plt.plot(fpr,tpr,'r--')
plt.show()
print( logit_sm.summary() )
#print(yhat)
#build_RF_variant_model(X,Y)